{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Fake Data Creator\n",
    "## Imports and Helper Functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn import datasets\n",
    "from random import uniform \n",
    "from faker import Factory,Faker\n",
    "from collections import defaultdict\n",
    "from sklearn.preprocessing import normalize,MinMaxScaler\n",
    "from pandas import to_datetime\n",
    "from random import randint\n",
    "%matplotlib inline\n",
    "fake = Faker()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def fixer(x1,mu1,std1,mu2):\n",
    "    '''\n",
    "    Fixes column values to be more realistic.\n",
    "    ''' \n",
    "    std2 = mu1/4.1\n",
    "    return ((x1-mu1)/(std1)) * (std2) + mu2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def realizer(data,minmax,rounder):\n",
    "    mm = MinMaxScaler((minmax[0],minmax[1]))\n",
    "    if rounder != '':\n",
    "        return np.round(mm.fit_transform(data.as_matrix().reshape(-1,1)) , 0)  \n",
    "    else:\n",
    "        return np.round(mm.fit_transform(data.as_matrix().reshape(-1,1)) , 2)   "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Dictionary of Faker Providers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "date_past = to_datetime('2006-01-01 11:42:52')\n",
    "date_present = to_datetime('2017-01-01 11:42:52')\n",
    "\n",
    "fake_codes = {}\n",
    "\n",
    "fake_codes['phone_number'] = lambda x: fake.phone_number()\n",
    "fake_codes['name'] = lambda x: fake.name()\n",
    "fake_codes['catch_phrase'] = lambda x: fake.catch_phrase()\n",
    "fake_codes['city'] = lambda x: fake.city()\n",
    "fake_codes['binary'] = lambda x: randint(0,1)\n",
    "fake_codes['randint1to10'] = lambda x: randint(1,10)\n",
    "fake_codes['country'] = lambda x: fake.country()\n",
    "fake_codes['timestamp_this_year'] = lambda x: fake.date_time_this_year()\n",
    "fake_codes['timestamp'] = lambda x: fake.date_time_between_dates(date_past,date_present)\n",
    "fake_codes['address'] = lambda x:  fake.address()\n",
    "fake_codes['lot'] = lambda x:  fake.bothify()\n",
    "fake_codes['AM or PM'] = lambda x:  fake.am_pm()\n",
    "fake_codes['browser_info'] = lambda x:  fake.user_agent()\n",
    "fake_codes['company'] = lambda x:  fake.company()\n",
    "fake_codes['cc_num'] = lambda x:  fake.credit_card_number()\n",
    "fake_codes['cc_exp'] = lambda x:  fake.credit_card_expire()\n",
    "fake_codes['cc_sec_code'] = lambda x:  fake.credit_card_security_code()\n",
    "fake_codes['cc_provider'] = lambda x:  fake.credit_card_provider()\n",
    "fake_codes['email'] = lambda x:  fake.email()\n",
    "fake_codes['job'] = lambda x:  fake.job()\n",
    "fake_codes['ipv4'] = lambda x:  fake.ipv4()\n",
    "fake_codes['language'] = lambda x:  fake.language_code()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "_______"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "# Fake Classification Tasks\n",
    "\n",
    "For classification tasks. The purpose of this function is to return a fake data set that can be used for classification. \n",
    "\n",
    "\n",
    "You pass in a list in the form \n",
    "\n",
    "    [(('col_name','fakercode',(min,max,mu))]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "______"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def fake_data_classification(dataname='myfakedata',nsamples=1,\n",
    "                             nclass=2,datacode=[],target_name=\"target\",pct_pos=0.5,std=2):\n",
    "    \"\"\"\n",
    "    INPUT: Takes in a string of the dataname, and a dictionary of column names along with\n",
    "           what they represent using the Faker() library.\n",
    "    OUTPUT: Prints the head of the dataframe and also saves it to a csv file.\n",
    "    \"\"\"\n",
    "    print(\"Generating Classification Data...\")\n",
    "    # Create Data\n",
    "    data = datasets.make_blobs(n_samples=nsamples, n_features=len(datacode), \n",
    "                           centers=nclass, cluster_std=std)\n",
    "\n",
    "    # Convert to DataFrames with normalized numbers\n",
    "    features = pd.DataFrame(normalize(data[0])).apply(lambda x: x+1)\n",
    "    features = pd.DataFrame(data[0])\n",
    "    \n",
    "    # Target\n",
    "    target =  pd.DataFrame(data[1])\n",
    "    target.columns = [target_name]\n",
    "    \n",
    "    print(\"Running Faker Code\")\n",
    "    for colind,(col_name,code,minmax,rounder) in enumerate(datacode):\n",
    "        \n",
    "        # Set Data to be Numerically Realistic\n",
    "        features[colind] = realizer(features[colind],minmax,rounder)\n",
    "        # Rename the column as required\n",
    "        features=features.rename(columns = {colind:col_name})\n",
    "        \n",
    "        # Check to see if there is fake data to be generated!\n",
    "        if code != 'none':\n",
    "            features[col_name] = features[col_name].apply(fake_codes[code])\n",
    "        \n",
    "    ratio = pct_pos*(2/0.5)\n",
    "\n",
    "    print(\"Completed Faker Generation\")\n",
    "    print(\"Saving as \"+dataname+\".csv\")\n",
    "    final_data = pd.concat([features,target],axis=1)\n",
    "    \n",
    "    # Fix issues with \n",
    "    num_pos = len(final_data[final_data[target_name]==1])\n",
    "    positives = final_data[final_data[target_name]==1].sample(np.round(num_pos*ratio))\n",
    "    negatives = final_data[final_data[target_name]==0]\n",
    "    \n",
    "    final_data = pd.concat([positives,negatives])\n",
    "    final_data.to_csv(dataname+\".csv\",index=False)\n",
    "    final_pos = len(final_data[final_data[target_name]==1])\n",
    "    final_neg = len(final_data[final_data[target_name]==0])\n",
    "    print(\"The number of positive points are: {}\".format(final_pos))\n",
    "    print(\"The number of negative points are: {}\".format(final_neg))\n",
    "    print(\"Percent of positive points is: {}\".format(final_pos/(final_neg+final_pos)))\n",
    "\n",
    "    \n",
    "    print(\"Example of DataFrame Created:\")\n",
    "    print(\"\\n\")\n",
    "    print(final_data.head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def multi_fake_data_classification(dataname='myfakedata',nsamples=1,\n",
    "                             nclass=3,datacode=[],target_name=\"target\",std=2):\n",
    "    \"\"\"\n",
    "    INPUT: Takes in a string of the dataname, and a dictionary of column names along with\n",
    "           what they represent using the Faker() library.\n",
    "    OUTPUT: Prints the head of the dataframe and also saves it to a csv file.\n",
    "    \"\"\"\n",
    "    print(\"Generating Classification Data...\")\n",
    "    # Create Data\n",
    "    data = datasets.make_blobs(n_samples=nsamples, n_features=len(datacode), \n",
    "                           centers=nclass, cluster_std=std)\n",
    "\n",
    "    # Convert to DataFrames with normalized numbers\n",
    "    features = pd.DataFrame(normalize(data[0])).apply(lambda x: x+1)\n",
    "    features = pd.DataFrame(data[0])\n",
    "    \n",
    "    # Target\n",
    "    target =  pd.DataFrame(data[1])\n",
    "    target.columns = [target_name]\n",
    "    \n",
    "    print(\"Running Faker Code\")\n",
    "    for colind,(col_name,code,minmax,rounder) in enumerate(datacode):\n",
    "        \n",
    "        # Set Data to be Numerically Realistic\n",
    "        features[colind] = realizer(features[colind],minmax,rounder)\n",
    "        # Rename the column as required\n",
    "        features=features.rename(columns = {colind:col_name})\n",
    "        \n",
    "        # Check to see if there is fake data to be generated!\n",
    "        if code != 'none':\n",
    "            features[col_name] = features[col_name].apply(fake_codes[code])\n",
    "        \n",
    "    \n",
    "\n",
    "    print(\"Completed Faker Generation\")\n",
    "    print(\"Saving as \"+dataname+\".csv\")\n",
    "    final_data = pd.concat([features,target],axis=1)\n",
    "    \n",
    "    \n",
    "    print(\"Example of DataFrame Created:\")\n",
    "    print(\"\\n\")\n",
    "    print(final_data.describe())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def multi_fake_data_classification(dataname='myfakedata',nsamples=1,nclass=3,datacode=[],target_name=\"target\",std=2):\n",
    "    \n",
    "\n",
    "    data = datasets.make_blobs(n_samples=nsamples, n_features=len(datacode), centers=nclass, cluster_std=std)\n",
    "    return data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Data Sets for clustering"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Generating Classification Data...\n",
      "Running Faker Code\n",
      "Completed Faker Generation\n",
      "Saving as hack_data.csv\n",
      "Example of DataFrame Created:\n",
      "\n",
      "\n",
      "       Session_Connection_Time  Bytes Transferred  Kali_Trace_Used  \\\n",
      "count               500.000000         500.000000       500.000000   \n",
      "mean                 35.624000        1148.456020         0.490000   \n",
      "std                  18.874828         577.357338         0.500401   \n",
      "min                   1.000000          10.000000         0.000000   \n",
      "25%                  12.000000         462.907500         0.000000   \n",
      "50%                  46.000000        1397.335000         0.000000   \n",
      "75%                  50.000000        1610.045000         1.000000   \n",
      "max                  60.000000        2000.000000         1.000000   \n",
      "\n",
      "       Servers_Corrupted  Pages_Corrupted  WPM_Typing_Speed        hack  \n",
      "count         500.000000       500.000000         500.00000  500.000000  \n",
      "mean            6.127640         9.542000          60.86342    0.998000  \n",
      "std             3.035803         2.442164          11.42301    0.816903  \n",
      "min             1.000000         3.000000          40.00000    0.000000  \n",
      "25%             2.242500         8.000000          46.44000    0.000000  \n",
      "50%             7.610000        10.000000          67.36500    1.000000  \n",
      "75%             8.472500        11.000000          69.43500    2.000000  \n",
      "max            10.000000        15.000000          75.00000    2.000000  \n"
     ]
    }
   ],
   "source": [
    "# Hacker Data Set\n",
    "base = [\n",
    "    (\"Session_Connection_Time\",'none',(1,60),'yes'),\n",
    "    (\"Bytes Transferred\",'none',(10,2000),''),\n",
    "    (\"Kali_Trace_Used\",'binary',(0,1),''),\n",
    "    (\"Servers_Corrupted\",'none',(1,10),''),\n",
    "    (\"Pages_Corrupted\",'none',(3,15),'yes'),\n",
    "    (\"Location\",\"country\",(0,1),''),\n",
    "    (\"WPM_Typing_Speed\",'none',(40,75),'')\n",
    "]\n",
    "result = multi_fake_data_classification('hack_data',nsamples=500,nclass=3,datacode=base,\n",
    "                                  target_name=\"hack\", std=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(array([[-1.1381536 ,  4.92688901, -5.85879158, ...,  1.54835818,\n",
       "         -7.65706875,  8.14730731],\n",
       "        [-3.61286444, -2.56854496, -5.56765664, ...,  7.49127328,\n",
       "          5.58157365, -5.00669104],\n",
       "        [-0.27161734, -9.7048933 , -2.5822295 , ...,  0.49970095,\n",
       "          1.37105124,  2.75386952],\n",
       "        ..., \n",
       "        [-1.26937462, -8.3583158 , -3.81481804, ..., -0.87315573,\n",
       "          2.00820047,  2.11995745],\n",
       "        [-2.07290113,  3.20194004, -4.15905688, ..., -0.61489482,\n",
       "         -7.15047156,  7.37089153],\n",
       "        [-1.34311186, -8.36161986, -2.98110887, ...,  0.75193482,\n",
       "         -0.03669774,  5.38140152]]),\n",
       " array([1, 2, 0, 0, 0, 1, 0, 2, 0, 2, 0, 0, 0, 0, 2, 2, 0, 2, 1, 0, 0, 2, 2,\n",
       "        0, 1, 1, 0, 1, 0, 0, 0, 2, 1, 0, 2, 1, 0, 0, 2, 2, 0, 0, 2, 0, 2, 1,\n",
       "        2, 0, 2, 1, 0, 2, 1, 0, 0, 1, 0, 0, 2, 2, 0, 1, 2, 1, 1, 1, 2, 0, 0,\n",
       "        1, 2, 1, 2, 2, 0, 0, 2, 2, 0, 2, 0, 2, 2, 0, 0, 1, 1, 0, 1, 0, 0, 0,\n",
       "        2, 0, 0, 1, 2, 2, 1, 2, 0, 0, 0, 0, 1, 0, 2, 1, 2, 1, 2, 1, 0, 1, 1,\n",
       "        2, 1, 1, 1, 2, 0, 0, 0, 2, 1, 2, 1, 1, 2, 0, 2, 1, 2, 1, 2, 1, 1, 1,\n",
       "        0, 0, 2, 2, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 2, 0, 0, 0, 0, 0, 2,\n",
       "        2, 1, 2, 2, 0, 1, 2, 0, 2, 0, 0, 0, 2, 1, 0, 1, 0, 0, 0, 0, 2, 2, 2,\n",
       "        1, 0, 0, 0, 0, 2, 1, 2, 0, 1, 1, 0, 1, 0, 0, 0, 0, 2, 1, 2, 1, 1, 2,\n",
       "        1, 2, 0, 1, 2, 1, 2, 1, 1, 1, 2, 0, 1, 0, 1, 1, 1, 1, 1, 1, 2, 0, 2,\n",
       "        0, 1, 2, 1, 1, 2, 0, 2, 2, 0, 2, 0, 0, 2, 0, 2, 0, 0, 2, 2, 2, 1, 1,\n",
       "        2, 2, 1, 2, 2, 2, 1, 2, 2, 0, 1, 1, 1, 1, 0, 1, 2, 0, 1, 1, 1, 1, 1,\n",
       "        0, 0, 0, 0, 2, 1, 1, 0, 2, 2, 1, 1, 0, 0, 1, 2, 1, 0, 0, 1, 1, 1, 2,\n",
       "        0, 1, 1, 0, 2, 0, 1, 0, 1, 0, 2, 1, 1, 1, 2, 1, 0, 2, 2, 1, 1, 0, 1,\n",
       "        0, 2, 2, 2, 2, 2, 1, 0, 1, 2, 1, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 1, 2,\n",
       "        1, 2, 1, 2, 1, 0, 1, 2, 0, 0, 2, 1, 0, 1, 0, 2, 2, 1, 0, 1, 1, 1, 0,\n",
       "        0, 1, 0, 1, 1, 0, 0, 2, 2, 2, 2, 2, 1, 2, 1, 2, 2, 1, 2, 0, 1, 1, 2,\n",
       "        2, 2, 1, 2, 2, 1, 2, 1, 1, 2, 0, 0, 2, 2, 1, 0, 1, 1, 0, 0, 1, 2, 2,\n",
       "        0, 1, 2, 0, 0, 2, 2, 0, 1, 0, 2, 0, 2, 1, 1, 0, 1, 1, 1, 2, 2, 2, 2,\n",
       "        0, 0, 1, 2, 2, 2, 0, 1, 1, 2, 1, 0, 2, 1, 1, 2, 1, 1, 0, 0, 2, 0, 2,\n",
       "        1, 0, 2, 0, 0, 0, 0, 2, 1, 0, 2, 2, 1, 1, 1, 1, 2, 0, 1, 1, 0, 2, 0,\n",
       "        1, 2, 0, 1, 2, 0, 2, 2, 0, 1, 1, 1, 2, 0, 0, 1, 0]))"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Session_Connection_Time</th>\n",
       "      <th>Bytes Transferred</th>\n",
       "      <th>Kali_Trace_Used</th>\n",
       "      <th>Servers_Corrupted</th>\n",
       "      <th>Pages_Corrupted</th>\n",
       "      <th>WPM_Typing_Speed</th>\n",
       "      <th>hack</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>334.000000</td>\n",
       "      <td>334.000000</td>\n",
       "      <td>334.000000</td>\n",
       "      <td>334.000000</td>\n",
       "      <td>334.000000</td>\n",
       "      <td>334.000000</td>\n",
       "      <td>334.00000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>30.008982</td>\n",
       "      <td>607.245269</td>\n",
       "      <td>0.511976</td>\n",
       "      <td>5.258503</td>\n",
       "      <td>10.838323</td>\n",
       "      <td>57.342395</td>\n",
       "      <td>0.50000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>14.088201</td>\n",
       "      <td>286.335932</td>\n",
       "      <td>0.500607</td>\n",
       "      <td>2.301907</td>\n",
       "      <td>3.063526</td>\n",
       "      <td>13.411063</td>\n",
       "      <td>0.50075</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>1.000000</td>\n",
       "      <td>10.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>6.000000</td>\n",
       "      <td>40.000000</td>\n",
       "      <td>0.00000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>18.000000</td>\n",
       "      <td>372.200000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>3.122500</td>\n",
       "      <td>8.000000</td>\n",
       "      <td>44.127500</td>\n",
       "      <td>0.00000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>31.000000</td>\n",
       "      <td>601.650000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>5.285000</td>\n",
       "      <td>10.500000</td>\n",
       "      <td>57.840000</td>\n",
       "      <td>0.50000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>42.000000</td>\n",
       "      <td>843.702500</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>7.400000</td>\n",
       "      <td>14.000000</td>\n",
       "      <td>70.577500</td>\n",
       "      <td>1.00000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>60.000000</td>\n",
       "      <td>1330.500000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>10.000000</td>\n",
       "      <td>15.000000</td>\n",
       "      <td>75.000000</td>\n",
       "      <td>1.00000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       Session_Connection_Time  Bytes Transferred  Kali_Trace_Used  \\\n",
       "count               334.000000         334.000000       334.000000   \n",
       "mean                 30.008982         607.245269         0.511976   \n",
       "std                  14.088201         286.335932         0.500607   \n",
       "min                   1.000000          10.000000         0.000000   \n",
       "25%                  18.000000         372.200000         0.000000   \n",
       "50%                  31.000000         601.650000         1.000000   \n",
       "75%                  42.000000         843.702500         1.000000   \n",
       "max                  60.000000        1330.500000         1.000000   \n",
       "\n",
       "       Servers_Corrupted  Pages_Corrupted  WPM_Typing_Speed       hack  \n",
       "count         334.000000       334.000000        334.000000  334.00000  \n",
       "mean            5.258503        10.838323         57.342395    0.50000  \n",
       "std             2.301907         3.063526         13.411063    0.50075  \n",
       "min             1.000000         6.000000         40.000000    0.00000  \n",
       "25%             3.122500         8.000000         44.127500    0.00000  \n",
       "50%             5.285000        10.500000         57.840000    0.50000  \n",
       "75%             7.400000        14.000000         70.577500    1.00000  \n",
       "max            10.000000        15.000000         75.000000    1.00000  "
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.read_csv('hack_data.csv').describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Creating Classification Sets\n",
    "\n",
    "### Binary Customer Churn\n",
    "\n",
    "A marketing agency has many customers that use their service to produce ads for the client/customer websites. They've noticed that they have quite a bit of churn in clients. They basically randomly assign account managers right now, but want you to create a machine learning model that will help predict which customers will churn (stop buying their service) so that they can correctly assign the customers most at risk to churn an account manager. Luckily they have some historical data, can you help them out? Create a classification algorithm that will help classify whether or not a customer churned. Then the company can test this against incoming data for future customers to predict which customers will churn and assign them an account manager.\n",
    "\n",
    "Here are the fields and their definitions:\n",
    "* Name : Name of the latest contact at Company\n",
    "* Age: Customer Age\n",
    "* Total_Purchase: Total Ads Purchased\n",
    "* Account_Manager: Binary 0=No manager, 1= Account manager assigned\n",
    "* Years: Totaly Years as a customer\n",
    "* Num_sites: Number of websites that use the service.\n",
    "* Onboard_date: Date that the name of the latest contact was onboarded\n",
    "* Location: Client HQ Address\n",
    "* Company: Name of Client Company"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 172,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Generating Classification Data...\n",
      "Running Faker Code\n",
      "Completed Faker Generation\n",
      "Saving as new_customers.csv\n",
      "The number of positive points are: 1\n",
      "The number of negative points are: 5\n",
      "Percent of positive points is: 0.16666666666666666\n",
      "Example of DataFrame Created:\n",
      "\n",
      "\n",
      "            Names   Age  Total_Purchase  Account_Manager  Years  Num_Sites  \\\n",
      "2   Andrew Mccall  37.0         9935.53                1   7.71        8.0   \n",
      "1  Michele Wright  23.0         7526.94                1   9.28       15.0   \n",
      "3    Jeremy Chang  65.0          100.00                1   1.00       15.0   \n",
      "4  Megan Ferguson  32.0         6487.50                0   9.40       14.0   \n",
      "5    Taylor Young  32.0        13147.71                1  10.00        8.0   \n",
      "\n",
      "         Onboard_date                                           Location  \\\n",
      "2 2011-08-29 18:37:54  38612 Johnny Stravenue\\nNataliebury, WI 15717-...   \n",
      "1 2013-07-22 18:19:54  21083 Nicole Junction Suite 332\\nYoungport, ME...   \n",
      "3 2006-12-11 07:48:13    085 Austin Views\\nLake Julialand, WY 63726-4298   \n",
      "4 2016-10-28 05:32:13     922 Wright Branch\\nNorth Cynthialand, NC 64721   \n",
      "5 2012-03-20 00:36:46                   Unit 0789 Box 0734\\nDPO AP 39702   \n",
      "\n",
      "            Company  Churn  \n",
      "2          King Ltd      1  \n",
      "1     Cannon-Benson      0  \n",
      "3  Barron-Robertson      0  \n",
      "4     Sexton-Golden      0  \n",
      "5          Wood LLC      0  \n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/marci/anaconda/lib/python3.5/site-packages/pandas/core/generic.py:2572: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future\n",
      "  locs = rs.choice(axis_length, size=n, replace=replace, p=weights)\n"
     ]
    }
   ],
   "source": [
    "base = [\n",
    "    ('Names','name',(0,1),''),\n",
    "    (\"Age\",'none',(22,65),'yes'),\n",
    "    (\"Total_Purchase\",'none',(100,20000),''),\n",
    "    (\"Account_Manager\",'binary',(0,1),''),\n",
    "    (\"Years\",'none',(1,10),''),\n",
    "    (\"Num_Sites\",'none',(3,15),'yes'),\n",
    "    (\"Onboard_date\",'timestamp',(0,1),''),\n",
    "    (\"Location\",\"address\",(0,1),''),\n",
    "    (\"Company\",\"company\",(0,1),'')\n",
    "]\n",
    "result = fake_data_classification('new_customers',nsamples=10,nclass=2,datacode=base,\n",
    "                                  target_name=\"Churn\",pct_pos=0.05,std=6.2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Check with X,y split\n",
    "*Don't edit this!*"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 134,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "             precision    recall  f1-score   support\n",
      "\n",
      "          0       0.89      0.98      0.93       243\n",
      "          1       0.81      0.46      0.59        54\n",
      "\n",
      "avg / total       0.88      0.88      0.87       297\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df = pd.read_csv(\"customer_churn.csv\")\n",
    "y = df['Churn']\n",
    "X = df[['Age',\"Total_Purchase\",\"Account_Manager\",'Years',\"Num_Sites\"]]\n",
    "print(classification_report(y_test,predictions))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Dog Food\n",
    "\n",
    "You've been hired by a dog food company to try to predict why some batches of their dog food are spoiling much quicker than intended! Unfortunately this Dog Food company hasn't upgraded to the latest machinery, meaning that the amounts of the five preservative chemicals they are using can vary a lot, but which is the chemical that has the strongest effect? The dog food company first mixes up a batch of preservative that contains 4 different preservative chemicals (A,B,C,D) and then is completed with a \"filler\" chemical. The food scientists beelive one of the A,B,C, or D preservatives is causing the problem, but need your help to figure out which one!\n",
    "\n",
    "Use Machine Learning with RF to find out which parameter had the most predicitive power, thus finding out which chemical causes the early spoiling! So create a model and then find out how you can decide which chemical is the problem!\n",
    "\n",
    "* Pres_A : Percentage of preservative A in the mix\n",
    "* Pres_B : Percentage of preservative B in the mix\n",
    "* Pres_C : Percentage of preservative C in the mix\n",
    "* Pres_D : Percentage of preservative D in the mix\n",
    "* Other: Filler chemical that is not a preservative"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 140,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Generating Classification Data...\n",
      "Running Faker Code\n",
      "Completed Faker Generation\n",
      "Saving as dog_food.csv\n",
      "The number of positive points are: 140\n",
      "The number of negative points are: 350\n",
      "Percent of positive points is: 0.2857142857142857\n",
      "Example of DataFrame Created:\n",
      "\n",
      "\n",
      "     A  B     C  D  Spoiled\n",
      "108  4  2  12.0  3        1\n",
      "355  5  6  12.0  7        1\n",
      "24   6  2  13.0  6        1\n",
      "598  4  2  12.0  1        1\n",
      "506  4  2  12.0  3        1\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/marci/anaconda/lib/python3.5/site-packages/pandas/core/generic.py:2572: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future\n",
      "  locs = rs.choice(axis_length, size=n, replace=replace, p=weights)\n"
     ]
    }
   ],
   "source": [
    "base = [\n",
    "     ('A','randint1to10',(0,1),''),\n",
    "    ('B','randint1to10',(0,1),''),\n",
    "    ('C','none',(5,15),'yes'),\n",
    "    ('D','randint1to10',(0,1),''),\n",
    "]\n",
    "\n",
    "result = fake_data_classification('dog_food2',nsamples=700,nclass=2,datacode=base,\n",
    "                                  target_name=\"Spoiled\",pct_pos=0.1,std=3.2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 146,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "             precision    recall  f1-score   support\n",
      "\n",
      "          0       0.99      1.00      1.00       114\n",
      "          1       1.00      0.98      0.99        48\n",
      "\n",
      "avg / total       0.99      0.99      0.99       162\n",
      "\n",
      "Index(['A', 'B', 'C', 'D', 'Spoiled'], dtype='object')\n",
      "[ 0.04022167  0.05269918  0.8641571   0.04292205]\n"
     ]
    }
   ],
   "source": [
    "df = pd.read_csv(\"dog_food.csv\")\n",
    "y = df['Spoiled']\n",
    "X = df[['A',\"B\",\"C\",'D']]\n",
    "print(classification_report(y_test,predictions))\n",
    "print(df.columns)\n",
    "print(rfc.feature_importances_)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Test Validity with RF"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 142,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import classification_report"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 143,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "             precision    recall  f1-score   support\n",
      "\n",
      "          0       0.99      1.00      1.00       114\n",
      "          1       1.00      0.98      0.99        48\n",
      "\n",
      "avg / total       0.99      0.99      0.99       162\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# X = df.drop('Class'.as_matrix()\n",
    "# y = np.ravel(target.as_matrix())\n",
    "X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.33)\n",
    "rfc = RandomForestClassifier(200)\n",
    "rfc.fit(X_train,y_train)   \n",
    "predictions = rfc.predict(X_test)\n",
    "print(classification_report(y_test,predictions))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Cancer Detection\n",
    "\n",
    "Just use breast cancer data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Possible regression task with Facebook Data!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 149,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df = pd.read_csv('DataSets/dataset_Facebook.csv',sep=';')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "_______\n",
    "\n",
    "______"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Regression Tasks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def fake_data_regression(dataname='myfakedata',nsamples=1,datacode=[],target_name=\"target\"):\n",
    "    \"\"\"\n",
    "    INPUT: Takes in a string of the dataname, and a dictionary of column names along with\n",
    "           what they represent using the Faker() library.\n",
    "    OUTPUT: Prints the head of the dataframe and also saves it to a csv file.\n",
    "    \"\"\"\n",
    "    print(\"Generating Classification Data...\")\n",
    "    std = uniform(1,2)\n",
    "    print(\"Random std value was: \"+str(std))\n",
    "    # Create Data\n",
    "    data = datasets.make_regression(n_samples=nsamples, n_features=len(datacode), \n",
    "                                    n_informative=len(datacode), n_targets=1,noise=0.2)\n",
    "    \n",
    "    # Convert to DataFrames with normalized numbers\n",
    "    features = pd.DataFrame(normalize(data[0])).apply(lambda x: x+1)\n",
    "    features = pd.DataFrame(data[0])\n",
    "    \n",
    "    # Target\n",
    "    target =  pd.DataFrame(data[1])\n",
    "    target.columns = [target_name]\n",
    "    \n",
    "    print(\"Running Faker Code\")\n",
    "    for colind,(col_name,code,minmax) in enumerate(datacode):\n",
    "        \n",
    "        # Set Data to be Numerically Realistic\n",
    "        features[colind] = realizer(features[colind],minmax)\n",
    "        # Rename the column as required\n",
    "        features=features.rename(columns = {colind:col_name})\n",
    "        \n",
    "        # Check to see if there is fake data to be generated!\n",
    "        if code != 'none':\n",
    "            features[col_name] = features[col_name].apply(fake_codes[code])\n",
    "        \n",
    "        \n",
    "    print(\"Completed Faker Generation\")\n",
    "    print(\"Saving as \"+dataname+\".csv\")\n",
    "    final_data = pd.concat([features,target],axis=1)\n",
    "    final_data.to_csv(dataname+\".csv\")\n",
    "    print(\"Example of DataFrame Created:\")\n",
    "    print(\"\\n\")\n",
    "    print(final_data.head())\n",
    "    return final_data\n",
    "                           "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "fake_data_regression()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "_______"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [default]",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
